Raw data quality control


Initialization

We start the analysis by initializing the packages required for all the analysis performed in this section. We also define the root directory, within which all the input/output operations for this project will be performed. At the end of this document, detailed software version information is provided for easier reproducibility of the analysis.


Raw data summary

We show below the summary for the raw screening data - Yeast Non-essential MATa library.

     Plate           Well_96            Well            Group       
 102    :  4512   E10    :  2496   Q13    :   156   AZ     :  2496  
 105    :  4512   E4     :  2496   Q14    :   156   BE     :  2496  
 108    :  4512   E9     :  2496   Q15    :   156   BF     :  2496  
 109    :  4512   A1     :  2448   Q16    :   156   A      :  2448  
 110    :  4512   A10    :  2448   Q33    :   156   AB     :  2448  
 112    :  4512   A11    :  2448   Q34    :   156   AD     :  2448  
 (Other):200736   (Other):212976   (Other):226872   (Other):212976  
         Content      SystematicName        SGD.ID          Gene.Symbol       
 Blank       :56952   Length:227808      Length:227808      Length:227808     
 Control     :56952   Class :character   Class :character   Class :character  
 Cytoplasm   :56952   Mode  :character   Mode  :character   Mode  :character  
 Mitochondria:56952                                                           
                                                                              
                                                                              
                                                                              
  roGFP2.ratio      Signal_485_520  Signal_400_520         Type      
 Min.   :-7491.00   Min.   :3.170   Min.   :3.819   Glucose  :75936  
 1st Qu.:    0.36   1st Qu.:4.197   1st Qu.:4.834   Galactose:75936  
 Median :    0.49   Median :4.683   Median :4.903   Glycerol :75936  
 Mean   :    0.48   Mean   :4.574   Mean   :4.918                    
 3rd Qu.:    0.64   3rd Qu.:4.926   3rd Qu.:5.012                    
 Max.   : 1122.00   Max.   :5.415   Max.   :5.415                    
 NA's   :56952                                                       

Background Fluorescence

Firstly, we look at the distribution of background flourescence across all plates and retain only those colonies for Control, Mitochondria and Cytoplasm whose fluorescence signal exceeds the 90th percentile of blank 485-520 and 400-520 fluorescence signals

tmpdat = rawDat[, c("Plate", "Content", "Signal_485_520", "Signal_400_520", "Type")]
tmpdat = melt(tmpdat)

a = droplevels(tmpdat[tmpdat$variable == "Signal_485_520" & tmpdat$Content == "Blank", 
    ])
per90_485_520 = lapply(split(a, a$Plate), function(x) split(x, x$Type))
per90_485_520 = sapply(per90_485_520, function(x) sapply(x, function(y) quantile(y$value, 
    0.9)))
per90_485_520 = melt(t(per90_485_520))
per90_485_520$Var2 = gsub(".90%", "", per90_485_520$Var2)
colnames(per90_485_520) = c("Plate", "Type", "value")

a = droplevels(tmpdat[tmpdat$variable == "Signal_400_520" & tmpdat$Content == "Blank", 
    ])
per90_400_520 = lapply(split(a, a$Plate), function(x) split(x, x$Type))
per90_400_520 = sapply(per90_400_520, function(x) sapply(x, function(y) quantile(y$value, 
    0.9)))
per90_400_520 = melt(t(per90_400_520))
per90_400_520$Var2 = gsub(".90%", "", per90_400_520$Var2)
colnames(per90_400_520) = c("Plate", "Type", "value")

# Filtering out colonies

if (identical(paste0(per90_400_520$Plate, per90_400_520$Type), paste0(per90_485_520$Plate, 
    per90_485_520$Type))) {
    cutoff = data.frame(per90_400_520[, 1:2], per90_400_520 = per90_400_520$value, 
        per90_485_520 = per90_485_520$value, stringsAsFactors = F)
}

rawDatFilt = lapply(split(rawDat, rawDat$Plate), function(x) split(x, x$Type))

for (i in 1:nrow(cutoff)) {
    x = rawDatFilt[[as.character(cutoff$Plate[i])]][[cutoff$Type[i]]]
    x = x[x$Signal_485_520 > cutoff$per90_485_520[i] & x$Signal_400_520 > cutoff$per90_400_520[i], 
        ]
    rawDatFilt[[as.character(cutoff$Plate[i])]][[cutoff$Type[i]]] = x
}

rawDatFilt = do.call("rbind", lapply(rawDatFilt, function(x) do.call("rbind", x)))

# Plot background fluorescence distributions
p1 = ggplot(tmpdat, aes(x = value, fill = Content)) + geom_density(alpha = 0.5) + 
    theme_bw(base_size = 8) + labs(subtitle = "Non-essential MATa screen", x = "Fluorescence  signal (in log10)", 
    y = "Density", fill = "") + facet_wrap(variable ~ Type, scales = "free", ncol = 3) + 
    theme(legend.position = "right", panel.grid = element_blank())

p2 = ggplot(tmpdat[tmpdat$variable == "Signal_485_520", ], aes(x = value, fill = Content)) + 
    geom_density(alpha = 0.5) + theme_bw(base_size = 8) + labs(subtitle = "Non-essential MATa screen | Signal_485_520", 
    x = "Fluorescence  signal (in log10)", y = "Density", fill = "") + geom_vline(data = per90_485_520, 
    aes(xintercept = value)) + facet_wrap(Plate ~ Type, scales = "free", ncol = 3) + 
    theme(legend.position = "right", panel.grid = element_blank())

p3 = ggplot(tmpdat[tmpdat$variable == "Signal_400_520", ], aes(x = value, fill = Content)) + 
    geom_density(alpha = 0.5) + theme_bw(base_size = 8) + labs(subtitle = "Non-essential MATa screen | Signal_400_520", 
    x = "Fluorescence  signal (in log10)", y = "Density", fill = "") + geom_vline(data = per90_400_520, 
    aes(xintercept = value)) + facet_wrap(Plate ~ Type, scales = "free", ncol = 3) + 
    theme(legend.position = "right", panel.grid = element_blank())

ggsave(filename = paste0(path, "analysis/QC/background_fluorescence.pdf"), plot = p1, 
    width = 7, height = 3.5)
ggsave(filename = paste0(path, "analysis/QC/background_fluorescence_Signal_485_520.pdf"), 
    plot = p2, width = 7, height = 60, limitsize = F)
ggsave(filename = paste0(path, "analysis/QC/background_fluorescence_Signal_400_520.pdf"), 
    plot = p3, width = 7, height = 60, limitsize = F)

p1

[1] "total number of colonies in the raw data - 227808"
[1] "total number of colonies in the background flourescence filtered data - 127459"
[1] "% of colonies KEPT - 55.95"

Cleaning raw data

Firstly we read the raw yeast redox screen data and remove outlier (extreme values) and NA. Typically, outlier removal is not considered good data analysis practice, however, we do it here because we know that the extreme values comes from technical artifacts. For example, roGFP2 ratios cannot be negative and those which are significantly higher than the plate average are ambiguous signals.

Outliers were detected as -

  • upper outlier values > 75th quantile value for a plate + 3 * plate IQR value
  • lower outlier values < 25th quantile value for a plate - 3 * plate IQR value

The outlier values are all set to NA and values greater than the lower bound threshold but less than 0 were set to pseudo minimum value of 0.0001 as roGFP2 ratios can’t be less than 0.

We show these extreme outliers below, ploting the distribution of values from each plate.

Next, lets check the summaries of the raw roGFP2 redox screen data and the cleaned raw data for comparison.

Raw data looked like -

     Plate           Well_96            Well            Group       
 104    :  3418   A12    :  1943   O48    :   152   L      :  1943  
 102    :  3257   H12    :  1934   P48    :   151   CR     :  1934  
 107    :  3242   D12    :  1867   S48    :   151   AV     :  1867  
 142    :  3103   A11    :  1866   S47    :   150   K      :  1866  
 116    :  2903   E12    :  1843   T48    :   150   BH     :  1843  
 101    :  2895   C12    :  1829   A13    :   149   AJ     :  1829  
 (Other):108641   (Other):116177   (Other):126556   (Other):116177  
         Content      SystematicName        SGD.ID          Gene.Symbol       
 Blank       : 3743   Length:127459      Length:127459      Length:127459     
 Control     :35277   Class :character   Class :character   Class :character  
 Cytoplasm   :40939   Mode  :character   Mode  :character   Mode  :character  
 Mitochondria:47500                                                           
                                                                              
                                                                              
                                                                              
  roGFP2.ratio      Signal_485_520  Signal_400_520         Type      
 Min.   :-331.890   Min.   :3.920   Min.   :4.260   Glucose  :43926  
 1st Qu.:   0.390   1st Qu.:4.710   1st Qu.:4.905   Galactose:43449  
 Median :   0.508   Median :4.894   Median :4.986   Glycerol :40084  
 Mean   :   0.529   Mean   :4.840   Mean   :4.998                    
 3rd Qu.:   0.667   3rd Qu.:5.019   3rd Qu.:5.078                    
 Max.   :  78.548   Max.   :5.415   Max.   :5.415                    
 NA's   :3743                                                        

Cleaned data looked like -

     Plate           Well_96            Well            Group       
 104    :  3333   D12    :  1736   O48    :   152   AV     :  1736  
 102    :  3162   C12    :  1710   S48    :   151   AJ     :  1710  
 107    :  3140   E12    :  1710   P48    :   150   BH     :  1710  
 142    :  3004   A12    :  1703   S47    :   150   CF     :  1703  
 116    :  2834   G12    :  1703   T48    :   150   L      :  1703  
 118    :  2833   H12    :  1698   A13    :   149   CR     :  1698  
 (Other):104714   (Other):112760   (Other):122118   (Other):112760  
         Content      SystematicName        SGD.ID          Gene.Symbol       
 Control     :35133   Length:123020      Length:123020      Length:123020     
 Cytoplasm   :40772   Class :character   Class :character   Class :character  
 Mitochondria:47115   Mode  :character   Mode  :character   Mode  :character  
                                                                              
                                                                              
                                                                              
                                                                              
  roGFP2.ratio    Signal_485_520  Signal_400_520         Type      
 Min.   :0.0001   Min.   :3.938   Min.   :4.725   Glucose  :42333  
 1st Qu.:0.3907   1st Qu.:4.733   1st Qu.:4.906   Galactose:41968  
 Median :0.5079   Median :4.902   Median :4.988   Glycerol :38719  
 Mean   :0.5356   Mean   :4.858   Mean   :5.000                    
 3rd Qu.:0.6647   3rd Qu.:5.023   3rd Qu.:5.079                    
 Max.   :5.1000   Max.   :5.415   Max.   :5.415                    
                                                                   

Percentage of data removed after outlier removal -

[1] 3.48

Identification of edge effects

A well known technical issue with yeast mutant screens is the altered growth characteristics of yeast colonies growing at the edge of plates. Next, we look at how roGFP2 ratios are different between edge wells and non-edge wells.

These are the edge wells and the mutants originating from these locations should be carefully analyzed.

  [1] "I01" "Q01" "Y01" "A03" "D02" "F04" "I03" "L02" "N04" "Q03" "T02" "V04"
 [13] "Y03" "b02" "d04" "A09" "A17" "A25" "A33" "A41" "B05" "B13" "B21" "B29"
 [25] "B37" "B45" "C09" "C17" "C25" "C33" "C41" "D05" "D13" "D21" "D29" "D37"
 [37] "D45" "F45" "H45" "J45" "L45" "N45" "P45" "R45" "T45" "V45" "X45" "Z45"
 [49] "b05" "b13" "b21" "b29" "b37" "b45" "c09" "c17" "c25" "c33" "c41" "d05"
 [61] "d13" "d21" "d29" "d37" "d45" "e09" "e17" "e25" "e33" "e41" "J01" "R01"
 [73] "Z01" "A04" "D03" "G02" "I04" "L03" "O02" "Q04" "T03" "W02" "Y04" "b03"
 [85] "e02" "A10" "A18" "A26" "A34" "A42" "B06" "B14" "B22" "B30" "B38" "B46"
 [97] "C10" "C18" "C26" "C34" "C42" "D06" "D14" "D22" "D30" "D38" "D46" "F46"
[109] "H46" "J46" "L46" "N46" "P46" "R46" "T46" "V46" "X46" "Z46" "b06" "b14"
[121] "b22" "b30" "b38" "b46" "c10" "c18" "c26" "c34" "c42" "d06" "d14" "d22"
[133] "d30" "d38" "d46" "e10" "e18" "e26" "e34" "e42" "K01" "S01" "a01" "B02"
[145] "D04" "G03" "J02" "L04" "O03" "R02" "T04" "W03" "Z02" "b04" "e03" "A11"
[157] "A19" "A27" "A35" "A43" "B07" "B15" "B23" "B31" "B39" "B47" "C11" "C19"
[169] "C27" "C35" "C43" "D07" "D15" "D23" "D31" "D39" "D47" "F47" "H47" "J47"
[181] "L47" "N47" "P47" "R47" "T47" "V47" "X47" "Z47" "b07" "b15" "b23" "b31"
[193] "b39" "b47" "c11" "c19" "c27" "c35" "c43" "d07" "d15" "d23" "d31" "d39"
[205] "d47" "e11" "e19" "e27" "e35" "e43" "L01" "T01" "b01" "B03" "E02" "G04"
[217] "J03" "M02" "O04" "R03" "U02" "W04" "Z03" "c02" "e04" "A12" "A20" "A28"
[229] "A36" "A44" "B08" "B16" "B24" "B32" "B40" "B48" "C12" "C20" "C28" "C36"
[241] "C44" "D08" "D16" "D24" "D32" "D40" "D48" "F48" "H48" "J48" "L48" "N48"
[253] "P48" "R48" "T48" "V48" "X48" "Z48" "b08" "b16" "b24" "b32" "b40" "b48"
[265] "c12" "c20" "c28" "c36" "c44" "d08" "d16" "d24" "d32" "d40" "d48" "e12"
[277] "e20" "e28" "e36" "e44" "M01" "U01" "c01" "B04" "E03" "H02" "J04" "M03"
[289] "P02" "R04" "U03" "X02" "Z04" "c03" "A05" "A13" "A21" "A29" "A37" "A45"
[301] "B09" "B17" "B25" "B33" "B41" "C05" "C13" "C21" "C29" "C37" "C45" "D09"
[313] "D17" "D25" "D33" "D41" "E45" "G45" "I45" "K45" "M45" "O45" "Q45" "S45"
[325] "U45" "W45" "Y45" "a45" "b09" "b17" "b25" "b33" "b41" "c05" "c13" "c21"
[337] "c29" "c37" "c45" "d09" "d17" "d25" "d33" "d41" "e05" "e13" "e21" "e29"
[349] "e37" "e45" "N01" "V01" "d01" "C02" "E04" "H03" "K02" "M04" "P03" "S02"
[361] "U04" "X03" "a02" "c04" "A06" "A14" "A22" "A30" "A38" "A46" "B10" "B18"
[373] "B26" "B34" "B42" "C06" "C14" "C22" "C30" "C38" "C46" "D10" "D18" "D26"
[385] "D34" "D42" "E46" "G46" "I46" "K46" "M46" "O46" "Q46" "S46" "U46" "W46"
[397] "Y46" "a46" "b10" "b18" "b26" "b34" "b42" "c06" "c14" "c22" "c30" "c38"
[409] "c46" "d10" "d18" "d26" "d34" "d42" "e06" "e14" "e22" "e30" "e38" "e46"
[421] "O01" "W01" "e01" "C03" "F02" "H04" "K03" "N02" "P04" "S03" "V02" "X04"
[433] "a03" "d02" "A07" "A15" "A23" "A31" "A39" "A47" "B11" "B19" "B27" "B35"
[445] "B43" "C07" "C15" "C23" "C31" "C39" "C47" "D11" "D19" "D27" "D35" "D43"
[457] "D47" "G47" "I47" "K47" "M47" "O47" "Q47" "S47" "U47" "W47" "Y47" "a47"
[469] "b11" "b19" "b27" "b35" "b43" "c07" "c15" "c23" "c31" "c39" "c47" "d11"
[481] "d19" "d27" "d35" "d43" "e07" "e15" "e23" "e31" "e39" "e47" "P01" "X01"
[493] "A02" "C04" "F03" "I02" "K04" "N03" "Q02" "S04" "V03" "Y02" "a04" "d03"
[505] "A08" "A16" "A24" "A32" "A40" "A48" "B12" "B20" "B28" "B36" "B44" "C08"
[517] "C16" "C24" "C32" "C40" "C48" "D12" "D20" "D28" "D36" "D44" "D48" "G48"
[529] "I48" "K48" "M48" "O48" "Q48" "S48" "U48" "W48" "Y48" "a48" "b12" "b20"
[541] "b28" "b36" "b44" "c08" "c16" "c24" "c32" "c40" "c48" "d12" "d20" "d28"
[553] "d36" "d44" "e08" "e16" "e24" "e32" "e40" "e48"

Visulaizing the differences using both effect size (Fold change) and pvalue.

For none of the plates and conditions, the edge effects exceed the log fold change of +/- 1


Variance among controls

Next we look into the variance exhibited by the control colonies across plates and nutrient conditions. Ideally, all control replicates for a given mutant in a given plate and nutrient condition should have similar values i.e variance close to zero. Below we show the variance of raw roGFP2 ratios among controls per plate and group.

Next we remove all genes with variance among controls > 0.05

[1] "Number of poor control genes removed - 77"

Next we we visualize the fact that in a good screen, individual control values should not deviate much from the median value of all controls from that same plate. For a given plate \(i\), the normalized control values i.e \(NormCtrl\) is given as - \[NormCtrl_i = \frac{Ctrl_i} {median(Ctrl_i)}\] We plot these scaled control values per plate across conditions below. Ideally these values should be close to 1 (shown as red horizontal lines). A table for these scaled control values is also provided.

ctrlDat = split(rawDatCleaned_ctrlfilt, rawDatCleaned_ctrlfilt$Type)
ctrlDat = lapply(ctrlDat, function(x) {
    split(x, x$Plate)
})

for (i in 1:length(ctrlDat)) {
    for (j in 1:length(ctrlDat[[i]])) {
        tmp = droplevels(ctrlDat[[i]][[j]])
        tmp = tmp[, -2]
        
        # Plate control
        tmp.ctrl = tmp[tmp$Content == "Control", ]
        
        # Normalizing factor
        nf = median(tmp.ctrl$roGFP2.ratio, na.rm = T)
        
        # Measure control value deviation from plate median
        ctrlDat[[i]][[j]] = data.frame(Gene = tmp.ctrl$Gene.Symbol, NormControl = round(log2(tmp.ctrl$roGFP2.ratio/nf), 
            3), stringsAsFactors = F)
        # Deleting
        rm(tmp, tmp.ctrl, nf)
    }
    rm(j)
}
rm(i)

ctrlDat = melt(ctrlDat, id = "Gene")
ctrlDat = ctrlDat[, c(4, 5, 1, 3)]
colnames(ctrlDat) = c("Plate", "Nutrient", "Gene", "NormControl")
ctrlDat$Plate = as.factor(ctrlDat$Plate)
ctrlDat$Nutrient = factor(ctrlDat$Nutrient, levels = c("Glucose", "Galactose", "Glycerol"))

# datatable(ctrlDat, rownames = FALSE, filter='top', class='compact', extensions
# = c('Buttons') , options = list(autoWidth = TRUE, dom = 'Blfrtip', buttons =
# c('csv', 'excel') ))

p = ggplot(ctrlDat, aes(x = Plate, y = NormControl)) + theme_bw(base_size = 8) + 
    labs(y = "Control / Median plate control (roGFP2 ratios) in log2 scale") + geom_boxplot(outlier.size = 0.1, 
    lwd = 0.2) + facet_wrap(~Nutrient) + geom_hline(yintercept = 3, color = "orange") + 
    geom_hline(yintercept = -3, color = "orange") + # geom_text_repel(data = subset(ctrlDat, NormControl > 3), aes(x = Plate, y =
# NormControl, label = Gene), size = 2) +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))

p

Next we remove those controls that deviate 3 log fold units over the plate control median value

[1] "Number of control genes removed which deviated 3 log fold units from the plate median - 468"

Plotting these scaled control values -

A more standard way of calculating the variance among the controls is by computing the robust coefficient of variance see here and here. For a given plate \(i\), this is given as - \[CoefVar_i = \frac{mad(Ctrl_i)}{median(Ctrl_i)}\] mad is Median absolute deviation

Plotting these robust coefficient of variation for control values-


Saving the data

Finally we save the cleaned raw Data as a .RDS (R data object), which will be used for the next step of normalization

Session information

R version 3.6.2 (2019-12-12)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS  10.16

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggallin_0.1.1    WriteXLS_5.0.0   DT_0.12          plotly_4.9.2    
 [5] lsr_0.5          ggpubr_0.2.4     magrittr_1.5     ggrepel_0.8.1   
 [9] ggplot2_3.2.1    reshape2_1.4.3   rmdformats_0.3.6 knitr_1.28      

loaded via a namespace (and not attached):
 [1] tidyselect_1.0.0   xfun_0.12          purrr_0.3.3        colorspace_1.4-1  
 [5] vctrs_0.2.2        htmltools_0.4.0    viridisLite_0.3.0  yaml_2.2.1        
 [9] rlang_0.4.4        pillar_1.4.3       later_1.0.0        glue_1.3.1        
[13] withr_2.1.2        RColorBrewer_1.1-2 lifecycle_0.1.0    plyr_1.8.5        
[17] stringr_1.4.0      munsell_0.5.0      ggsignif_0.6.0     gtable_0.3.0      
[21] htmlwidgets_1.5.1  evaluate_0.14      labeling_0.3       fastmap_1.0.1     
[25] Cairo_1.5-11       httpuv_1.5.5       crosstalk_1.0.0    Rcpp_1.0.3        
[29] xtable_1.8-4       promises_1.1.0     scales_1.1.0       formatR_1.7       
[33] jsonlite_1.6.1     mime_0.9           farver_2.0.3       digest_0.6.23     
[37] stringi_1.4.5      bookdown_0.17      dplyr_0.8.4        shiny_1.4.0       
[41] grid_3.6.2         tools_3.6.2        lazyeval_0.2.2     tibble_2.1.3      
[45] crayon_1.3.4       tidyr_1.0.2        pkgconfig_2.0.3    ellipsis_0.3.0    
[49] data.table_1.12.8  assertthat_0.2.1   rmarkdown_2.1      httr_1.4.1        
[53] R6_2.4.1           compiler_3.6.2    

Ashwini Kumar Sharma, PhD

2021-06-18